library(ggplot2)
library(lubridate)
library(stringr)
library(stargazer)
library(here)
library(sentimentr)

# Relative paths in the Google-search sections resolve under release_data/SOTU.
setwd(here("release_data", "SOTU"))

############################
############################
# GOOGLE SEARCHES
############################
############################

# Table of dates to keep; its `sotu_date` column is parsed further down.
dates <- read.csv(file = "sotu_dates_new.csv")

#### REPORT ####
# Hourly Google Trends pulls for the "report" topic: stack the pull files,
# label each pull with its speech, keep the dates listed in `dates`, then
# produce Figure 5 (report) and the Table 8 regressions.
setwd(here("release_data", "SOTU", "report"))

files <- list.files()

# Sort the pull files by the digits embedded in their names so the pull index
# assigned below follows that numbering.
fl <- data.frame(files = files, num = as.numeric(gsub("\\D", "", files)))
fl <- fl[order(fl$num), ]

# Read each pull into a preallocated list and bind once, rather than growing
# `full` with rbind() on every iteration. seq_along() also avoids the 1:0
# trap of 1:length(x) when the directory is empty.
pulls <- vector("list", nrow(fl))
for (i in seq_along(pulls)) {
  tmp <- read.csv(as.character(fl$files[i]))
  tmp$pull <- i
  pulls[[i]] <- tmp
}
full <- do.call(rbind, pulls)

full$date2 <- as.Date(full$date)

# Positional rename: assumes every CSV supplies exactly three columns, which
# are followed by the `pull` and `date2` columns added above.
colnames(full) <- c("hour", "immigr", "partial", "pull", "date")

# analysis
# Pulls cycle through the six speeches in groups of six (1-6, 7-12, 13-18).
full$speech <- "Obama sotu 15"
full$speech[full$pull %in% c(1, 7, 13)] <- "Trump 2019 speech"
full$speech[full$pull %in% c(2, 8, 14)] <- "Trump 2019 sotu"
full$speech[full$pull %in% c(3, 9, 15)] <- "Trump 2018 sotu"
full$speech[full$pull %in% c(4, 10, 16)] <- "Trump 2017 sotu"
full$speech[full$pull %in% c(5, 11, 17)] <- "Obama 2016 sotu"
full$speech[full$pull %in% c(6, 12, 18)] <- "Obama 2015 sotu"

# drop the 2019 immigration speech
full <- full[full$speech != "Trump 2019 speech", ]

# convert times
hours <- as.POSIXct(full$hour, tz = "UTC")
hours <- with_tz(hours, "EST")

full$est_hours <- as.character(hours)
full$est_date <- as.Date(full$est_hours)

# Hour of day is read from the EST timestamp itself. The previous
# `full$num_hour <- 0:23` relied on recycling and was only correct when every
# retained date had exactly 24 rows in chronological order.
full$num_hour <- hour(hours)

# get the dates we want
# `sotu_date` is split on "/" into its leading, middle and trailing digit
# groups and reassembled as a zero-padded "yyyy-mm-dd" string.
dates$sotu_date <- as.character(dates$sotu_date)
dates$mo <- str_pad(
  as.numeric(str_extract(dates$sotu_date, "^\\d+")),
  width = 2, side = "left", pad = "0"
)
dates$day <- str_pad(
  as.numeric(gsub("/", "", str_extract(dates$sotu_date, "/\\d+/"))),
  width = 2, side = "left", pad = "0"
)
dates$yr <- as.numeric(str_extract(dates$sotu_date, "\\d+$"))

dates$date <- paste0(dates$yr, "-", dates$mo, "-", dates$day)
full <- full[full$est_date %in% as.Date(dates$date), ]

# Hours 21 and 22 (EST) are flagged as the speech window.
full$speech_time <- ifelse(full$num_hour %in% c(21, 22), 1, 0)

# immigration oval office speech lasted less than one hour
# (no effect in this section: those rows were dropped above)
full$speech_time[full$speech == "Trump 2019 speech" & full$num_hour == 22] <- 0

full$admin <- ifelse(full$date > as.Date("2017-01-20"), "Trump", "Obama")

# look at means
full$speech_date <- ifelse(full$pull < 7, 1, 0)
full$before_after <- "1 week-pre"
full$before_after[full$pull < 7] <- "Speech date"
# The third group of pulls starts at 13 (see the speech mapping above). The
# previous cut-off of 12 put pull 12 ("Obama 2015 sotu") in the post-week
# group while pulls 7-11 of the same group were labelled pre-week.
full$before_after[full$pull >= 13] <- "1 week-post"

agg <- aggregate(
  full$immigr,
  list(full$admin, full$speech_time, full$speech_date),
  mean
)
colnames(agg) <- c("admin", "speech_time", "speech_date", "mean")

agg2 <- aggregate(
  full$immigr,
  list(full$admin, full$num_hour, full$before_after),
  mean
)
colnames(agg2) <- c("admin", "num_hour", "speech_date", "mean")

######## FIGURE 5 - REPORT #######
agg2$speech_date <- factor(
  agg2$speech_date,
  levels = c("1 week-pre", "Speech date", "1 week-post"),
  ordered = TRUE
)
ggplot(agg2, aes(x = num_hour, y = mean, color = factor(speech_date))) +
  geom_point() +
  facet_wrap(speech_date ~ admin, ncol = 2) +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_color_manual(values = c("grey50", "firebrick3", "grey10")) +
  ylab("Google Trends") +
  geom_vline(xintercept = 21, linetype = "dashed") +
  xlab("Hour (EST)")

######## TABLE 8 - REPORT #######
# Same specification for each administration: speech-day indicator,
# speech-hour indicator, and their interaction.
trump_r <- lm(
  immigr ~ speech_date + speech_time + speech_date:speech_time,
  data = full[full$admin == "Trump", ]
)
trump_r

obama_r <- lm(
  immigr ~ speech_date + speech_time + speech_date:speech_time,
  data = full[full$admin == "Obama", ]
)

#### CRIME ####
# Hourly Google Trends pulls for the "crime" topic: stack the pull files,
# label each pull with its speech, keep the dates listed in `dates`, then
# produce Figure 5 (crime) and the Table 7 regressions.
setwd(here("release_data", "SOTU", "crime"))

files <- list.files()

# Sort the pull files by the digits embedded in their names so the pull index
# assigned below follows that numbering.
fl <- data.frame(files = files, num = as.numeric(gsub("\\D", "", files)))
fl <- fl[order(fl$num), ]

# Read each pull into a preallocated list and bind once, rather than growing
# `full` with rbind() on every iteration. seq_along() also avoids the 1:0
# trap of 1:length(x) when the directory is empty.
pulls <- vector("list", nrow(fl))
for (i in seq_along(pulls)) {
  tmp <- read.csv(as.character(fl$files[i]))
  tmp$pull <- i
  pulls[[i]] <- tmp
}
full <- do.call(rbind, pulls)

full$date2 <- as.Date(full$date)

# Positional rename: assumes every CSV supplies exactly three columns, which
# are followed by the `pull` and `date2` columns added above. The search
# index column keeps the name `immigr` because the models below refer to it.
colnames(full) <- c("hour", "immigr", "partial", "pull", "date")

# analysis
# Pulls cycle through the six speeches in groups of six (1-6, 7-12, 13-18).
full$speech <- "Obama sotu 15"
full$speech[full$pull %in% c(1, 7, 13)] <- "Trump 2019 speech"
full$speech[full$pull %in% c(2, 8, 14)] <- "Trump 2019 sotu"
full$speech[full$pull %in% c(3, 9, 15)] <- "Trump 2018 sotu"
full$speech[full$pull %in% c(4, 10, 16)] <- "Trump 2017 sotu"
full$speech[full$pull %in% c(5, 11, 17)] <- "Obama 2016 sotu"
full$speech[full$pull %in% c(6, 12, 18)] <- "Obama 2015 sotu"

# drop the 2019 immigration speech (disabled for this topic)
#full <- full[full$speech!="Trump 2019 speech",]

# convert times
hours <- as.POSIXct(full$hour, tz = "UTC")
hours <- with_tz(hours, "EST")

full$est_hours <- as.character(hours)
full$est_date <- as.Date(full$est_hours)

# Hour of day is read from the EST timestamp itself. The previous
# `full$num_hour <- 0:23` relied on recycling and was only correct when every
# retained date had exactly 24 rows in chronological order.
full$num_hour <- hour(hours)

# get the dates we want
# `sotu_date` is split on "/" into its leading, middle and trailing digit
# groups and reassembled as a zero-padded "yyyy-mm-dd" string.
dates$sotu_date <- as.character(dates$sotu_date)
dates$mo <- str_pad(
  as.numeric(str_extract(dates$sotu_date, "^\\d+")),
  width = 2, side = "left", pad = "0"
)
dates$day <- str_pad(
  as.numeric(gsub("/", "", str_extract(dates$sotu_date, "/\\d+/"))),
  width = 2, side = "left", pad = "0"
)
dates$yr <- as.numeric(str_extract(dates$sotu_date, "\\d+$"))

dates$date <- paste0(dates$yr, "-", dates$mo, "-", dates$day)
full <- full[full$est_date %in% as.Date(dates$date), ]

# Hours 21 and 22 (EST) are flagged as the speech window.
full$speech_time <- ifelse(full$num_hour %in% c(21, 22), 1, 0)

# immigration oval office speech lasted less than one hour
full$speech_time[full$speech == "Trump 2019 speech" & full$num_hour == 22] <- 0

full$admin <- ifelse(full$date > as.Date("2017-01-20"), "Trump", "Obama")

# look at means
full$speech_date <- ifelse(full$pull < 7, 1, 0)
full$before_after <- "1 week-pre"
full$before_after[full$pull < 7] <- "Speech date"
# The third group of pulls starts at 13 (see the speech mapping above). The
# previous cut-off of 12 put pull 12 ("Obama 2015 sotu") in the post-week
# group while pulls 7-11 of the same group were labelled pre-week.
full$before_after[full$pull >= 13] <- "1 week-post"

agg <- aggregate(
  full$immigr,
  list(full$admin, full$speech_time, full$speech_date),
  mean
)
colnames(agg) <- c("admin", "speech_time", "speech_date", "mean")

agg2 <- aggregate(
  full$immigr,
  list(full$admin, full$num_hour, full$before_after),
  mean
)
colnames(agg2) <- c("admin", "num_hour", "speech_date", "mean")

######## FIGURE 5 - CRIME #######
agg2$speech_date <- factor(
  agg2$speech_date,
  levels = c("1 week-pre", "Speech date", "1 week-post"),
  ordered = TRUE
)
ggplot(agg2, aes(x = num_hour, y = mean, color = factor(speech_date))) +
  geom_point() +
  facet_wrap(speech_date ~ admin, ncol = 2) +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_color_manual(values = c("grey50", "firebrick3", "grey10")) +
  ylab("Google Trends") +
  geom_vline(xintercept = 21, linetype = "dashed") +
  xlab("Hour (EST)")

############ TABLE 7 - CRIME ##########
# Same specification for each administration: speech-day indicator,
# speech-hour indicator, and their interaction.
trump_c <- lm(
  immigr ~ speech_date + speech_time + speech_date:speech_time,
  data = full[full$admin == "Trump", ]
)
obama_c <- lm(
  immigr ~ speech_date + speech_time + speech_date:speech_time,
  data = full[full$admin == "Obama", ]
)

#### WELFARE ####
# Hourly Google Trends pulls for the "welfare" topic: stack the pull files,
# label each pull with its speech, keep the dates listed in `dates`, then
# produce Figure 5 (welfare) and the Table 7 regressions.
setwd(here("release_data", "SOTU", "welfare"))

files <- list.files()

# Sort the pull files by the digits embedded in their names so the pull index
# assigned below follows that numbering.
fl <- data.frame(files = files, num = as.numeric(gsub("\\D", "", files)))
fl <- fl[order(fl$num), ]

# Read each pull into a preallocated list and bind once, rather than growing
# `full` with rbind() on every iteration. seq_along() also avoids the 1:0
# trap of 1:length(x) when the directory is empty.
pulls <- vector("list", nrow(fl))
for (i in seq_along(pulls)) {
  tmp <- read.csv(as.character(fl$files[i]))
  tmp$pull <- i
  pulls[[i]] <- tmp
}
full <- do.call(rbind, pulls)

full$date2 <- as.Date(full$date)

# Positional rename: assumes every CSV supplies exactly three columns, which
# are followed by the `pull` and `date2` columns added above. The search
# index column keeps the name `immigr` because the models below refer to it.
colnames(full) <- c("hour", "immigr", "partial", "pull", "date")

# analysis
# Pulls cycle through the six speeches in groups of six (1-6, 7-12, 13-18).
full$speech <- "Obama sotu 15"
full$speech[full$pull %in% c(1, 7, 13)] <- "Trump 2019 speech"
full$speech[full$pull %in% c(2, 8, 14)] <- "Trump 2019 sotu"
full$speech[full$pull %in% c(3, 9, 15)] <- "Trump 2018 sotu"
full$speech[full$pull %in% c(4, 10, 16)] <- "Trump 2017 sotu"
full$speech[full$pull %in% c(5, 11, 17)] <- "Obama 2016 sotu"
full$speech[full$pull %in% c(6, 12, 18)] <- "Obama 2015 sotu"

# drop the 2019 immigration speech (disabled for this topic)
#full <- full[full$speech!="Trump 2019 speech",]

# convert times
hours <- as.POSIXct(full$hour, tz = "UTC")
hours <- with_tz(hours, "EST")

full$est_hours <- as.character(hours)
full$est_date <- as.Date(full$est_hours)

# Hour of day is read from the EST timestamp itself. The previous
# `full$num_hour <- 0:23` relied on recycling and was only correct when every
# retained date had exactly 24 rows in chronological order.
full$num_hour <- hour(hours)

# get the dates we want
# `sotu_date` is split on "/" into its leading, middle and trailing digit
# groups and reassembled as a zero-padded "yyyy-mm-dd" string.
dates$sotu_date <- as.character(dates$sotu_date)
dates$mo <- str_pad(
  as.numeric(str_extract(dates$sotu_date, "^\\d+")),
  width = 2, side = "left", pad = "0"
)
dates$day <- str_pad(
  as.numeric(gsub("/", "", str_extract(dates$sotu_date, "/\\d+/"))),
  width = 2, side = "left", pad = "0"
)
dates$yr <- as.numeric(str_extract(dates$sotu_date, "\\d+$"))

dates$date <- paste0(dates$yr, "-", dates$mo, "-", dates$day)
full <- full[full$est_date %in% as.Date(dates$date), ]

# Hours 21 and 22 (EST) are flagged as the speech window.
full$speech_time <- ifelse(full$num_hour %in% c(21, 22), 1, 0)

# immigration oval office speech lasted less than one hour
full$speech_time[full$speech == "Trump 2019 speech" & full$num_hour == 22] <- 0

full$admin <- ifelse(full$date > as.Date("2017-01-20"), "Trump", "Obama")

# look at means
full$speech_date <- ifelse(full$pull < 7, 1, 0)
full$before_after <- "1 week-pre"
full$before_after[full$pull < 7] <- "Speech date"
# The third group of pulls starts at 13 (see the speech mapping above). The
# previous cut-off of 12 put pull 12 ("Obama 2015 sotu") in the post-week
# group while pulls 7-11 of the same group were labelled pre-week.
full$before_after[full$pull >= 13] <- "1 week-post"

agg <- aggregate(
  full$immigr,
  list(full$admin, full$speech_time, full$speech_date),
  mean
)
colnames(agg) <- c("admin", "speech_time", "speech_date", "mean")

agg2 <- aggregate(
  full$immigr,
  list(full$admin, full$num_hour, full$before_after),
  mean
)
colnames(agg2) <- c("admin", "num_hour", "speech_date", "mean")

######## FIGURE 5 - WELFARE #######
agg2$speech_date <- factor(
  agg2$speech_date,
  levels = c("1 week-pre", "Speech date", "1 week-post"),
  ordered = TRUE
)
ggplot(agg2, aes(x = num_hour, y = mean, color = factor(speech_date))) +
  geom_point() +
  facet_wrap(speech_date ~ admin, ncol = 2) +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_color_manual(values = c("grey50", "firebrick3", "grey10")) +
  ylab("Google Trends") +
  geom_vline(xintercept = 21, linetype = "dashed") +
  xlab("Hour (EST)")

####### TABLE 7 - WELFARE ########
# Same specification for each administration: speech-day indicator,
# speech-hour indicator, and their interaction.
trump_w <- lm(
  immigr ~ speech_date + speech_time + speech_date:speech_time,
  data = full[full$admin == "Trump", ]
)
obama_w <- lm(
  immigr ~ speech_date + speech_time + speech_date:speech_time,
  data = full[full$admin == "Obama", ]
)

############################
############################
# EMOTIONS
############################
############################
# Emotion scoring of the immigration-related sentences in each speech text
# file, followed by Figure 6.
setwd(here("release_data", "SOTU", "text"))

fils <- list.files()

# Per-file keyword counts, filled in the loop below.
illeg <- rep(NA, length(fils))
immigr <- rep(NA, length(fils))
wall <- rep(NA, length(fils))

# One emotion table per file, bound once after the loop instead of growing
# `full` with rbind() on every iteration.
emotions_by_file <- vector("list", length(fils))

ind <- 1
for (fil in fils) {
  # Collapse the file into one string so matches can span line breaks.
  tmp <- scan(fil, what = "character", sep = "\n")
  tmp <- paste0(tmp, collapse = " ")

  illeg[ind] <- length(unlist(
    str_extract_all(tmp, "illegal immigr|illegals|llegal alien")
  ))
  immigr[ind] <- length(unlist(
    str_extract_all(tmp, "immigr|illegals|llegal alien")
  ))
  wall[ind] <- length(unlist(str_extract_all(tmp, "wall|border")))

  # Split on ". " and keep only the pieces that mention immigration.
  tmp <- unlist(strsplit(tmp, split = "\\. "))
  tmp <- grep("immigr|illegal alien|illegals", tmp,
    ignore.case = TRUE, value = TRUE
  )

  # Remove the search terms themselves so they do not contribute to the
  # emotion counts.
  tmp <- gsub(
    "immigr\\w*|illegal immigr\\w*|illegals|llegal alien\\w*", "", tmp,
    ignore.case = TRUE
  )

  tmp2 <- emotion(tmp)
  tmp2$file <- fil

  emotions_by_file[[ind]] <- tmp2
  ind <- ind + 1
}
full <- do.call(rbind, emotions_by_file)

agg <- aggregate(full$emotion_count, list(full$file, full$emotion_type), sum)
colnames(agg) <- c("file", "emotion", "count")

# Presumably speech lengths in minutes. They are matched to `fils` by
# position, so fail loudly if the number of files changes rather than letting
# data.frame() recycle values.
speech_mins <- c(60, 58, 60, 80, 10, 82)
stopifnot(length(fils) == length(speech_mins))

df <- data.frame(file = fils, mins = speech_mins)

agg <- merge(agg, df)

# Drop the "*_negated" emotion categories.
agg <- agg[grep("_negated", agg$emotion, invert = TRUE), ]

# Note: per_min is computed here, but the figure below plots `count`.
agg$per_min <- agg$count / agg$mins

# Turn file names into display labels. The dot is escaped so that only a
# literal ".txt" is removed (an unescaped "." matches any character).
agg$file <- gsub("_sotu|\\.txt", "", agg$file)
agg$file <- gsub("_", " ", agg$file)

######  FIGURE 6 ######

agg$file <- factor(
  agg$file,
  ordered = TRUE,
  levels = c("2015", "2016", "2017", "2018", "2019 oval", "2019")
)
ggplot(
  agg[agg$emotion %in% c("anger", "fear", "sadness", "disgust"), ],
  aes(x = file, y = count)
) +
  geom_point() +
  facet_wrap(~emotion) +
  ylab("Mentions") +
  xlab(NULL) +
  geom_vline(xintercept = 2.5, linetype = "dashed") +
  annotate("text", x = 1.5, y = 27, label = c("Obama")) +
  annotate("text", x = 3.5, y = 27, label = c("Trump"))


############################
############################
# BING SEARCHES
############################
############################
setwd(here("release_data"))

# Average the search proportions (as percentages) by hour and colour group.
#
# tot: data frame with columns `prop`, `hour_p` and `color`.
# Returns a data frame with columns `hour_p` (POSIXct), `color` and `pct`.
bing_hourly_share <- function(tot) {
  shares <- aggregate(100 * tot$prop, list(tot$hour_p, tot$color), mean)
  colnames(shares) <- c("hour_p", "color", "pct")
  shares$hour_p <- as.POSIXct(shares$hour_p)
  shares
}

# Scatter of hourly search share by colour group, with a dashed vertical
# reference line at 2020-01-15 21:00.
#
# shares: data frame as returned by bing_hourly_share().
# Returns the ggplot object; it prints when called at top level.
plot_bing_share <- function(shares) {
  ggplot(shares, aes(x = hour_p, y = pct, color = color)) +
    geom_point() +
    scale_x_datetime(date_breaks = "6 hours", date_labels = "%I:%M %p") +
    xlab(NULL) +
    ylab("% Searches") +
    theme(legend.position = "bottom", legend.title = element_blank()) +
    scale_color_manual(values = c("grey10", "grey50", "firebrick3")) +
    geom_vline(
      xintercept = as.POSIXct("2020-01-15 21:00:00"),
      linetype = "dashed"
    )
}

########## FIGURE A4 ##########
# The same aggregation and plot are applied to each of the four topic files.
tot_c <- read.csv("crime_speech_proportions.csv")
agg_c <- bing_hourly_share(tot_c)
plot_bing_share(agg_c)

tot_w <- read.csv("welfare_speech_proportions.csv")
agg_w <- bing_hourly_share(tot_w)
plot_bing_share(agg_w)

tot_r <- read.csv("report_speech_proportions.csv")
agg_r <- bing_hourly_share(tot_r)
plot_bing_share(agg_r)

tot_h <- read.csv("hsi_speech_proportions.csv")
agg_h <- bing_hourly_share(tot_h)
plot_bing_share(agg_h)
